german <- read.csv("germancredit.csv", header=T)
head(german)[1:7]
## Default checkingstatus1 duration history purpose amount savings
## 1 0 A11 6 A34 A43 1169 A65
## 2 1 A12 48 A32 A43 5951 A61
## 3 0 A14 12 A34 A46 2096 A61
## 4 0 A11 42 A32 A42 7882 A61
## 5 1 A11 24 A33 A40 4870 A61
## 6 0 A14 36 A32 A46 9055 A65
Below I am making dummy variables (or one-hot encodings) of the categorical variables. This will be used for many circumstances where qualitative data isn’t allowed. For instance, performing PCA, or SMOTE. In James Gareth’s book “An Introduction To Statistical Learning” (ISLR), the authors discuss one-hot encoding in Chapter 6, specifically in the context of linear regression models. In Section 6.2.2, the authors explain that when one-hot encoding a categorical variable with k levels, one should create k-1 binary variables. The reason for this is to avoid perfect multicollinearity in the model matrix, which can cause problems when fitting linear regression models.
# One-Hot Encoding Without Removing the First Column
dummy <- dummyVars(" ~ .", data=german)
dummy.german <- data.frame(predict(dummy, newdata = german))
head(dummy.german)[1:4]
## Default checkingstatus1A11 checkingstatus1A12 checkingstatus1A13
## 1 0 1 0 0
## 2 1 0 1 0
## 3 0 0 0 0
## 4 0 1 0 0
## 5 1 1 0 0
## 6 0 0 0 0
Below is one-hot-encoding while removing the first column.
# Drop the first ('df' = 'drop first') column using the select() function
dummy.german.df <- subset(dummy.german, select=-c(
checkingstatus1A11, historyA30, purposeA40,
savingsA61, employA71, statusA91, othersA101,
propertyA121, otherplansA141, housingA151,
jobA171, teleA191, foreignA201
))
head(dummy.german.df)[1:4]
## Default checkingstatus1A12 checkingstatus1A13 checkingstatus1A14
## 1 0 0 0 0
## 2 1 1 0 0
## 3 0 0 0 1
## 4 0 0 0 0
## 5 1 0 0 0
## 6 0 0 0 1
# Counting Just the rows that did not Default (Good!)
default.0 <- dim(german[german$Default == 0, ])[1]
default.1 <- dim(german[german$Default == 1, ])[1]
paste("The number of those that did not default are", default.0, "and those that did default are", default.1)
## [1] "The number of those that did not default are 700 and those that did default are 300"
Balance.Plot <- function(data) {
class_counts <- data.frame(table(data$Default))
class_counts <- as_tibble(class_counts)
class_counts$class <- c("Default 0", "Default 1")
ggplot(class_counts, aes(x = class, y = Freq, fill=as.factor(class))) +
geom_bar(stat = "identity") +
labs(x = "Class", y = "Count", title = "Class Counts") +
scale_fill_manual(values=c("#F8766D", "#00BFC4")) +
theme_minimal()
}
Balance.Plot(german)
# Apply SMOTE to balance the dataset
balanced.dummy.german <- ROSE(Default ~ ., data = dummy.german, seed = 123)$data
Balance.Plot(balanced.dummy.german)
Count.Plot <- function(data, column.name) {
german.0.default <- data[data$Default == 0, ]
german.1.default <- data[data$Default == 1, ]
checkingstatus.counts.0 <- table(german.0.default[column.name])
checkingstatus.counts.1 <- table(german.1.default[column.name])
counts.df.0 <- as.data.frame(checkingstatus.counts.0)
counts.df.1 <- as.data.frame(checkingstatus.counts.1)
colnames(counts.df.0) <- c(column.name, "count")
colnames(counts.df.1) <- c(column.name, "count")
counts.df.0 <- counts.df.0 %>% mutate(class = 0)
counts.df.1 <- counts.df.1 %>% mutate(class = 1)
# Combine the two data frames
combined_df <- rbind(counts.df.0, counts.df.1)
# Create the plot
ggplot(combined_df, aes(x=combined_df[,column.name], y=count, fill=as.factor(class))) +
geom_bar(stat="identity", position="dodge") +
labs(x = column.name, y="Count", fill="Class") +
scale_fill_manual(values=c("#F8766D", "#00BFC4")) +
theme_minimal()
}
Count.Plot(german, "checkingstatus1")
Count.Plot(german, "history")
Count.Plot(german, "purpose")
Count.Plot(german, "savings")
Count.Plot(german, "employ")
Density.Plot <- function(data, column.name) {
# Create subsets of the dataframe based on the binary class
df_default0 <- data[data[["Default"]] == 0,]
df_default1 <- data[data[["Default"]] == 1,]
# Plot the two density plots on the same plot
ggplot() +
geom_density(data = df_default0, aes(x = df_default0[,column.name], fill = "Default 0"), alpha = 0.5) +
geom_density(data = df_default1, aes(x = df_default1[,column.name], fill = "Default 1"), alpha = 0.5) +
labs(title = paste("Distribution of", column.name, "by Default"),
x = column.name,
y = "Density") +
scale_fill_manual(values = c("#F8766D", "#00BFC4"), name = "Default") +
theme_minimal()
}
Density.Plot(german, "duration")
Density.Plot(german, "amount")
Density.Plot(german, "installment")
Density.Plot(german, "residence")
Density.Plot(german, "age")
Density.Plot(german, "cards")
Density.Plot(german, "liable")
RFE <- function(data, num.features=4) {
# Define the predictor and response variables
train.X <- data[, !(names(data) %in% c("Default"))]
train.Y <- as.factor(data[, "Default"])
# Define the control parameters for feature selection
ctrl <- rfeControl(functions = rfFuncs,
method = "cv",
number = 10)
# Perform recursive feature elimination using the random forest algorithm
rf_rfe <- rfe(train.X, train.Y, sizes = c(1:num.features), rfeControl = ctrl)
# Print the results
print(rf_rfe)
# Plot the results
p <- plot(rf_rfe, type = c("g", "o"))
print(p)
# Get the features
features <- row.names(varImp(rf_rfe))[1:num.features]
varimp_data <- data.frame(feature = features,
importance = varImp(rf_rfe)[1:num.features, 1])
# Plots the variable importances
gg <- ggplot(data = varimp_data,
aes(x = reorder(feature, -importance), y = importance, fill = feature)) +
geom_bar(stat="identity") + labs(x = "Features", y = "Variable Importance") +
geom_text(aes(label = round(importance, 2)), vjust=1.6, color="white", size=4) +
theme_bw() + theme(legend.position = "none") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
print(gg)
target.features <- c("Default", features)
return(subset(data, select=target.features))
}
RFE.4.German <- RFE(german)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.689 0.03178 0.02424 0.06864
## 2 0.710 0.21972 0.03300 0.07790
## 3 0.742 0.30027 0.03967 0.09527
## 4 0.737 0.32986 0.04270 0.10282
## 20 0.778 0.40747 0.03425 0.09421 *
##
## The top 5 variables (out of 20):
## checkingstatus1, duration, history, amount, savings
RFE.10.German <- RFE(german, num.features=10)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.691 0.03012 0.02025 0.06453
## 2 0.698 0.19492 0.04566 0.11076
## 3 0.733 0.27972 0.03743 0.10687
## 4 0.712 0.25745 0.05692 0.13524
## 5 0.733 0.30849 0.04423 0.09798
## 6 0.762 0.38809 0.05412 0.13560
## 7 0.754 0.36673 0.05641 0.14819
## 8 0.766 0.39372 0.03502 0.08374
## 9 0.762 0.38769 0.03853 0.09649
## 10 0.765 0.38995 0.03979 0.10783
## 20 0.768 0.38077 0.03553 0.08872 *
##
## The top 5 variables (out of 20):
## checkingstatus1, duration, history, amount, savings
RFE.4.Dummy.German <- RFE(dummy.german)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.700 0.00000 0.00000 0.00000
## 2 0.689 0.01123 0.03510 0.04459
## 3 0.710 0.10825 0.03197 0.08230
## 4 0.720 0.24240 0.03682 0.09612
## 61 0.762 0.34407 0.03615 0.10294 *
##
## The top 5 variables (out of 61):
## checkingstatus1A14, checkingstatus1A11, duration, historyA34, amount
RFE.2.Dummy.German <- RFE(dummy.german, num.features=2)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.700 0.000000 0.000000 0.000000
## 2 0.698 0.001235 0.006325 0.003904
## 61 0.762 0.353586 0.031903 0.090578 *
##
## The top 5 variables (out of 61):
## checkingstatus1A14, checkingstatus1A11, duration, historyA34, amount
RFE.10.Dummy.German <- RFE(dummy.german, num.features=10)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.700 0.00000 0.00000 0.00000
## 2 0.704 0.02697 0.01265 0.08527
## 3 0.719 0.13360 0.01792 0.07603
## 4 0.720 0.23540 0.03464 0.12233
## 5 0.717 0.21156 0.02869 0.10991
## 6 0.729 0.25947 0.02514 0.09212
## 7 0.742 0.29115 0.03084 0.11422
## 8 0.743 0.30061 0.02869 0.09993
## 9 0.731 0.29040 0.02514 0.08336
## 10 0.738 0.30251 0.02251 0.06384
## 61 0.767 0.37131 0.04322 0.12010 *
##
## The top 5 variables (out of 61):
## checkingstatus1A14, checkingstatus1A11, duration, historyA34, amount
RFE.10.Balanced.Dummy.German <- RFE(balanced.dummy.german, num.features = 10)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.5781 0.1562 0.06540 0.13045
## 2 0.6580 0.3159 0.03272 0.06569
## 3 0.7149 0.4300 0.04979 0.09990
## 4 0.7670 0.5342 0.03854 0.07732
## 5 0.7809 0.5618 0.03886 0.07782
## 6 0.8030 0.6058 0.05592 0.11185
## 7 0.8130 0.6258 0.04714 0.09424
## 8 0.8270 0.6538 0.04705 0.09406
## 9 0.8299 0.6596 0.04562 0.09118
## 10 0.8249 0.6496 0.04901 0.09807
## 61 0.8510 0.7021 0.04429 0.08862 *
##
## The top 5 variables (out of 61):
## checkingstatus1A14, foreignA201, historyA30, purposeA48, foreignA202
# Standardize the data
dummy.german_std <- scale(dummy.german)
# Perform PCA
german.pca <- prcomp(dummy.german_std, center = TRUE, scale. = TRUE)
# Extract the standard deviations of each principal component
sd <- summary(german.pca)$sdev
# Plot the standard deviations as a line plot
plot(sd, type = "b", xlab = "Principle Component", ylab = "Standard Deviation")
pov <- german.pca$sdev^2/sum(german.pca$sdev^2)
plot(pov, type = "b", xlab = "Principal Component", ylab = "Proportion of Variance")
# Standardize the data
dummy.german_std <- scale(balanced.dummy.german)
# Perform PCA
german.balanced.pca <- prcomp(dummy.german_std, center = TRUE, scale. = TRUE)
# Extract the standard deviations of each principal component
sd <- summary(german.balanced.pca)$sdev
# Plot the standard deviations as a line plot
plot(sd, type = "b", xlab = "Principle Component", ylab = "Standard Deviation")
pov <- german.balanced.pca$sdev^2/sum(german.balanced.pca$sdev^2)
plot(pov, type = "b", xlab = "Principal Component", ylab = "Proportion of Variance")
NB.Model.PCA <- function(pca, data) {
# Extract the principal component scores
pc_scores <- predict(pca, data)
# Split the data into training and testing sets on PCA
set.seed(123) # for reproducibility
train_index <- sample(nrow(data), nrow(data) * 0.7) # 70% for training
train_data <- pc_scores[train_index, ]
train_label <- as.factor(data[train_index,"Default"])
test_data <- pc_scores[-train_index, ]
test_label <- as.factor(data[-train_index,"Default"])
# Train the Naïve Bayes classifier using the training data
nb_model <- naive_bayes(train_data, train_label)
# # Evaluate the model performance on the test set for each value of K
nb_pred <- predict(nb_model, newdata=test_data)
nbConf <- confusionMatrix(nb_pred, test_label)
nbPredictiction <- prediction(as.numeric(nb_pred), as.numeric(test_label))
nbPerf <- performance(nbPredictiction, measure = "tpr", x.measure = "fpr")
nbAUC <- performance(nbPredictiction, measure = "auc")
print(plot(nbPerf))
# Extract performance metrics
sensitivity <- slot(nbPerf, "y.values")[[1]]
specificity <- 1 - slot(nbPerf, "x.values")[[1]]
auc <- slot(nbAUC, "y.values")
nbError <- mean(as.numeric(nb_pred) !=as.numeric(test_label))
# Print performance metrics
print(nbConf)
print(paste0("Sensitivity: ", sensitivity))
print(paste0("Specificity: ", specificity))
print(paste0("AUC: ", auc))
print(paste0("Error rate:", nbError))
# Calculate false positives
false_positives <- sum(as.numeric(nb_pred) == 2 & as.numeric(test_label) == 1)
# Calculate false positives as a percentage
total_negatives <- sum(as.numeric(test_label) == 1)
false_positives_percent <- false_positives / total_negatives * 100
# Print the false positives percentage
print(paste0("False positives percentage: ", round(false_positives_percent,3), "%"))
return(false_positives_percent)
}
# This function gets the first number of Principle Components.
# It requires a pca object and the number of PC's desired.
Get.First.n.PCs <- function(pca.data, n=4) {
# Extract the first four principal components
first_n_pcs <- pca.data$x[, 1:n]
# Create a new PCA results object with only the first four principal components
german.pca.first.n <- pca.data
german.pca.first.n$x <- first_n_pcs
german.pca.first.n$rotation <- pca.data$rotation[, 1:n, drop = FALSE]
return(german.pca.first.n)
}
NB.Model <- function(data) {
# Split the data into training and testing sets on PCA
set.seed(123) # for reproducibility
train_index <- sample(nrow(data), nrow(data) * 0.7) # 70% for training
train_data <- data[train_index, ]
train_label <- as.factor(data[train_index,"Default"])
test_data <- data[-train_index, ]
test_label <- as.factor(data[-train_index,"Default"])
# Train the Naïve Bayes classifier using the training data
nb_model <- naive_bayes(train_data, train_label)
# # Evaluate the model performance on the test set for each value of K
nb_pred <- predict(nb_model, newdata=test_data)
nbConf <- confusionMatrix(nb_pred, test_label)
nbPredictiction <- prediction(as.numeric(nb_pred), as.numeric(test_label))
nbPerf <- performance(nbPredictiction, measure = "tpr", x.measure = "fpr")
nbAUC <- performance(nbPredictiction, measure = "auc")
print(plot(nbPerf))
# Extract performance metrics
sensitivity <- slot(nbPerf, "y.values")[[1]]
specificity <- 1 - slot(nbPerf, "x.values")[[1]]
auc <- slot(nbAUC, "y.values")
nbError <- mean(as.numeric(nb_pred) !=as.numeric(test_label))
# Print performance metrics
print(nbConf)
print(paste0("Sensitivity: ", sensitivity))
print(paste0("Specificity: ", specificity))
print(paste0("AUC: ", auc))
print(paste0("Error rate:", nbError))
# Calculate false positives
false_positives <- sum(as.numeric(nb_pred) == 2 & as.numeric(test_label) == 1)
# Calculate false positives as a percentage
total_negatives <- sum(as.numeric(test_label) == 1)
false_positives_percent <- false_positives / total_negatives * 100
# Print the false positives percentage
print(paste0("False positives percentage: ", round(false_positives_percent,3), "%"))
return(false_positives_percent)
}
NB.Model.PCA(german.pca, dummy.german_std)
## NULL
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.985603470561783 1.01359221009092
## -0.985603470561783 145 5
## 1.01359221009092 4 146
##
## Accuracy : 0.97
## 95% CI : (0.9438, 0.9862)
## No Information Rate : 0.5033
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.94
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9732
## Specificity : 0.9669
## Pos Pred Value : 0.9667
## Neg Pred Value : 0.9733
## Prevalence : 0.4967
## Detection Rate : 0.4833
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9700
##
## 'Positive' Class : -0.985603470561783
##
## [1] "Sensitivity: 0" "Sensitivity: 0.966887417218543"
## [3] "Sensitivity: 1"
## [1] "Specificity: 1" "Specificity: 0.973154362416107"
## [3] "Specificity: 0"
## [1] "AUC: 0.970020889817325"
## [1] "Error rate:0.03"
## [1] "False positives percentage: 2.685%"
## [1] 2.684564
german.pca.first.four <- Get.First.n.PCs(german.pca, 4)
NB.Model.PCA(german.pca.first.four, dummy.german_std)
## NULL
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.985603470561783 1.01359221009092
## -0.985603470561783 111 39
## 1.01359221009092 38 112
##
## Accuracy : 0.7433
## 95% CI : (0.69, 0.7918)
## No Information Rate : 0.5033
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4867
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7450
## Specificity : 0.7417
## Pos Pred Value : 0.7400
## Neg Pred Value : 0.7467
## Prevalence : 0.4967
## Detection Rate : 0.3700
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.7433
##
## 'Positive' Class : -0.985603470561783
##
## [1] "Sensitivity: 0" "Sensitivity: 0.741721854304636"
## [3] "Sensitivity: 1"
## [1] "Specificity: 1" "Specificity: 0.74496644295302"
## [3] "Specificity: 0"
## [1] "AUC: 0.743344148628828"
## [1] "Error rate:0.256666666666667"
## [1] "False positives percentage: 25.503%"
## [1] 25.50336
german.pca.first.nine <- Get.First.n.PCs(german.pca, 9)
NB.Model.PCA(german.pca.first.nine, dummy.german_std)
## NULL
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.985603470561783 1.01359221009092
## -0.985603470561783 116 34
## 1.01359221009092 33 117
##
## Accuracy : 0.7767
## 95% CI : (0.7253, 0.8225)
## No Information Rate : 0.5033
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5533
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7785
## Specificity : 0.7748
## Pos Pred Value : 0.7733
## Neg Pred Value : 0.7800
## Prevalence : 0.4967
## Detection Rate : 0.3867
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.7767
##
## 'Positive' Class : -0.985603470561783
##
## [1] "Sensitivity: 0" "Sensitivity: 0.774834437086093"
## [3] "Sensitivity: 1"
## [1] "Specificity: 1" "Specificity: 0.778523489932886"
## [3] "Specificity: 0"
## [1] "AUC: 0.776678963509489"
## [1] "Error rate:0.223333333333333"
## [1] "False positives percentage: 22.148%"
## [1] 22.14765
german.pca.first.twenty <- Get.First.n.PCs(german.pca, 20)
NB.Model.PCA(german.pca.first.twenty, dummy.german_std)
## NULL
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.985603470561783 1.01359221009092
## -0.985603470561783 120 24
## 1.01359221009092 29 127
##
## Accuracy : 0.8233
## 95% CI : (0.7754, 0.8648)
## No Information Rate : 0.5033
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6466
##
## Mcnemar's Test P-Value : 0.5827
##
## Sensitivity : 0.8054
## Specificity : 0.8411
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.8141
## Prevalence : 0.4967
## Detection Rate : 0.4000
## Detection Prevalence : 0.4800
## Balanced Accuracy : 0.8232
##
## 'Positive' Class : -0.985603470561783
##
## [1] "Sensitivity: 0" "Sensitivity: 0.841059602649007"
## [3] "Sensitivity: 1"
## [1] "Specificity: 1" "Specificity: 0.805369127516778"
## [3] "Specificity: 0"
## [1] "AUC: 0.823214365082892"
## [1] "Error rate:0.176666666666667"
## [1] "False positives percentage: 19.463%"
## [1] 19.46309
NB.Model(RFE.10.Balanced.Dummy.German)
## NULL
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 149 1
## 1 0 150
##
## Accuracy : 0.9967
## 95% CI : (0.9816, 0.9999)
## No Information Rate : 0.5033
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9933
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.9934
## Pos Pred Value : 0.9933
## Neg Pred Value : 1.0000
## Prevalence : 0.4967
## Detection Rate : 0.4967
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9967
##
## 'Positive' Class : 0
##
## [1] "Sensitivity: 0" "Sensitivity: 0.993377483443709"
## [3] "Sensitivity: 1"
## [1] "Specificity: 1" "Specificity: 1" "Specificity: 0"
## [1] "AUC: 0.996688741721854"
## [1] "Error rate:0.00333333333333333"
## [1] "False positives percentage: 0%"
## [1] 0
KNN.Model.PCA <- function(pca.data, data) {
# Extract the principal component scores
pc_scores <- predict(pca.data, data)
# Split the data into training and testing sets
set.seed(123) # for reproducibility
train_index <- sample(nrow(data), nrow(data) * 0.7) # 70% for training
train_data <- pc_scores[train_index, ]
train_label <- as.factor(data[train_index,"Default"])
test_data <- pc_scores[-train_index, ]
test_label <- as.factor(data[-train_index,"Default"])
# Train the KNN classifier using the training data
knn_model <- train(
x = train_data,
y = train_label,
method = "knn",
trControl = trainControl(method = "cv", number = 10),
tuneGrid = data.frame(k = 1:30)
)
# Evaluate the model performance on the test set for each value of K
knnPred <- predict(knn_model, newdata=test_data)
knnConf <- confusionMatrix(knnPred, test_label)
# Choose the K that gives the lowest test error rate
kOpt <- knn_model$bestTune$k
# Plot the tuning parameter performance
gg <- ggplot(knn_model$results, aes(x=k, y=Accuracy)) +
geom_line() +
geom_point(size = 3) +
geom_vline(xintercept=kOpt, color="red", linetype="dashed") +
labs(title="Tuning Parameter Performance",
x="K",
y="Accuracy") +
theme_minimal()
print(gg)
knnPredictiction <- prediction(as.numeric(knnPred), as.numeric(test_label))
knnPerf <- performance(knnPredictiction, measure = "tpr", x.measure = "fpr")
knnAUC <- performance(knnPredictiction, measure = "auc")
print(plot(knnPerf))
# Report the model performance metrics for the optimal K
# Extract performance metrics
sensitivity <- slot(knnPerf, "y.values")[[1]]
specificity <- 1 - slot(knnPerf, "x.values")[[1]]
auc <- slot(knnAUC, "y.values")
knnError <- mean(as.numeric(knnPred) !=as.numeric(test_label))
# Print performance metrics
print(knnConf)
print(paste0("Sensitivity: ", sensitivity))
print(paste0("Specificity: ", specificity))
print(paste0("AUC: ", auc))
print(paste0("Optimal K:", kOpt))
print(paste0("Error rate:", knnError))
# Calculate false positives
false_positives <- sum(as.numeric(knnPred) == 2 & as.numeric(test_label) == 1)
# Calculate false positives as a percentage
total_negatives <- sum(as.numeric(test_label) == 1)
false_positives_percent <- false_positives / total_negatives * 100
# Print the false positives percentage
print(paste0("False positives percentage: ", round(false_positives_percent,3), "%"))
return(false_positives_percent)
}
# This function takes in non PCA like data.
KNN.Model <- function(data) {
# One-Hot Encoding Without Removing the First Column
dummy <- dummyVars(" ~ .", data=data)
dummy.data <- data.frame(predict(dummy, newdata = data))
# Standardize the data
dummy.data_std <- scale(dummy.data)
# Split the data into training and testing sets
set.seed(123) # for reproducibility
train_index <- sample(nrow(dummy.data_std), nrow(dummy.data_std) * 0.7) # 70% for training
train_data <- dummy.data_std[train_index, ]
train_label <- as.factor(dummy.data_std[train_index,"Default"])
test_data <- dummy.data_std[-train_index, ]
test_label <- as.factor(dummy.data_std[-train_index,"Default"])
# Train the KNN classifier using the training data
knn_model <- train(
x = train_data,
y = train_label,
method = "knn",
trControl = trainControl(method = "cv", number = 10),
tuneGrid = data.frame(k = 1:30)
)
# Evaluate the model performance on the test set for each value of K
knnPred <- predict(knn_model, newdata=test_data)
knnConf <- confusionMatrix(knnPred, test_label)
# Choose the K that gives the lowest test error rate
kOpt <- knn_model$bestTune$k
# Plot the tuning parameter performance
gg <- ggplot(knn_model$results, aes(x=k, y=Accuracy)) +
geom_line() +
geom_point(size = 3) +
geom_vline(xintercept=kOpt, color="red", linetype="dashed") +
labs(title="Tuning Parameter Performance",
x="K",
y="Accuracy") +
theme_minimal()
print(gg)
knnPredictiction <- prediction(as.numeric(knnPred), as.numeric(test_label))
knnPerf <- performance(knnPredictiction, measure = "tpr", x.measure = "fpr")
knnAUC <- performance(knnPredictiction, measure = "auc")
print(plot(knnPerf))
# Report the model performance metrics for the optimal K
# Extract performance metrics
sensitivity <- slot(knnPerf, "y.values")[[1]]
specificity <- 1 - slot(knnPerf, "x.values")[[1]]
auc <- slot(knnAUC, "y.values")
knnError <- mean(as.numeric(knnPred) !=as.numeric(test_label))
# Print performance metrics
print(knnConf)
print(paste0("Sensitivity: ", sensitivity))
print(paste0("Specificity: ", specificity))
print(paste0("AUC: ", auc))
print(paste0("Optimal K:", kOpt))
print(paste0("Error rate:", knnError))
# Calculate false positives
false_positives <- sum(as.numeric(knnPred) == 2 & as.numeric(test_label) == 1)
# Calculate false positives as a percentage
total_negatives <- sum(as.numeric(test_label) == 1)
false_positives_percent <- false_positives / total_negatives * 100
# Print the false positives percentage
print(paste0("False positives percentage: ", round(false_positives_percent,3), "%"))
return(false_positives_percent)
}
KNN.Model.PCA(german.pca, dummy.german_std)
## NULL
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.985603470561783 1.01359221009092
## -0.985603470561783 139 24
## 1.01359221009092 10 127
##
## Accuracy : 0.8867
## 95% CI : (0.8452, 0.9202)
## No Information Rate : 0.5033
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7735
##
## Mcnemar's Test P-Value : 0.02578
##
## Sensitivity : 0.9329
## Specificity : 0.8411
## Pos Pred Value : 0.8528
## Neg Pred Value : 0.9270
## Prevalence : 0.4967
## Detection Rate : 0.4633
## Detection Prevalence : 0.5433
## Balanced Accuracy : 0.8870
##
## 'Positive' Class : -0.985603470561783
##
## [1] "Sensitivity: 0" "Sensitivity: 0.841059602649007"
## [3] "Sensitivity: 1"
## [1] "Specificity: 1" "Specificity: 0.932885906040268"
## [3] "Specificity: 0"
## [1] "AUC: 0.886972754344638"
## [1] "Optimal K:24"
## [1] "Error rate:0.113333333333333"
## [1] "False positives percentage: 6.711%"
## [1] 6.711409
KNN.Model(RFE.4.German)
## NULL
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.654326261999973 1.52676127799994
## -0.654326261999973 200 6
## 1.52676127799994 4 90
##
## Accuracy : 0.9667
## 95% CI : (0.9396, 0.9839)
## No Information Rate : 0.68
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.923
##
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.9804
## Specificity : 0.9375
## Pos Pred Value : 0.9709
## Neg Pred Value : 0.9574
## Prevalence : 0.6800
## Detection Rate : 0.6667
## Detection Prevalence : 0.6867
## Balanced Accuracy : 0.9589
##
## 'Positive' Class : -0.654326261999973
##
## [1] "Sensitivity: 0" "Sensitivity: 0.9375" "Sensitivity: 1"
## [1] "Specificity: 1" "Specificity: 0.980392156862745"
## [3] "Specificity: 0"
## [1] "AUC: 0.958946078431372"
## [1] "Optimal K:2"
## [1] "Error rate:0.0333333333333333"
## [1] "False positives percentage: 1.961%"
## [1] 1.960784
KNN.Model(RFE.10.German)
## NULL
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.654326261999973 1.52676127799994
## -0.654326261999973 192 38
## 1.52676127799994 12 58
##
## Accuracy : 0.8333
## 95% CI : (0.7862, 0.8737)
## No Information Rate : 0.68
## P-Value [Acc > NIR] : 1.293e-09
##
## Kappa : 0.5875
##
## Mcnemar's Test P-Value : 0.000407
##
## Sensitivity : 0.9412
## Specificity : 0.6042
## Pos Pred Value : 0.8348
## Neg Pred Value : 0.8286
## Prevalence : 0.6800
## Detection Rate : 0.6400
## Detection Prevalence : 0.7667
## Balanced Accuracy : 0.7727
##
## 'Positive' Class : -0.654326261999973
##
## [1] "Sensitivity: 0" "Sensitivity: 0.604166666666667"
## [3] "Sensitivity: 1"
## [1] "Specificity: 1" "Specificity: 0.941176470588235"
## [3] "Specificity: 0"
## [1] "AUC: 0.772671568627451"
## [1] "Optimal K:10"
## [1] "Error rate:0.166666666666667"
## [1] "False positives percentage: 5.882%"
## [1] 5.882353
KNN.Model(RFE.10.Balanced.Dummy.German)
## NULL
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.985603470561783 1.01359221009092
## -0.985603470561783 147 3
## 1.01359221009092 2 148
##
## Accuracy : 0.9833
## 95% CI : (0.9615, 0.9946)
## No Information Rate : 0.5033
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9667
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9866
## Specificity : 0.9801
## Pos Pred Value : 0.9800
## Neg Pred Value : 0.9867
## Prevalence : 0.4967
## Detection Rate : 0.4900
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9834
##
## 'Positive' Class : -0.985603470561783
##
## [1] "Sensitivity: 0" "Sensitivity: 0.980132450331126"
## [3] "Sensitivity: 1"
## [1] "Specificity: 1" "Specificity: 0.986577181208054"
## [3] "Specificity: 0"
## [1] "AUC: 0.98335481576959"
## [1] "Optimal K:8"
## [1] "Error rate:0.0166666666666667"
## [1] "False positives percentage: 1.342%"
## [1] 1.342282